Start with a trimmed set of columns from the ps_performance and concordance results, which were visualized mainly in the TIBCO Spotfire file: /git_repositories/DataScienceCapstoneTwo/spotfire/data_cleaning_step2_EDA.dxp
Most of the unnecessary columns, as well as columns highly correlated with the remaining metrics, have already been removed from this file.
Do some basic visualization
import pandas as pd
import seaborn as sns
# Load the tab-separated output of cleaning step 2 from the zip archive.
ps_data = pd.read_csv(
    filepath_or_buffer='../data/data_cleaning_step2.zip',
    sep='\t',
)
# Preview the first 7 rows, transposed so that each column of the data
# becomes a row of the preview.
ps_data.head(n=7).T
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | |
|---|---|---|---|---|---|---|---|
| probeset_id | AX-100003653 | AX-100004573 | AX-100004941 | AX-100006840 | AX-100007392 | AX-100007701 | AX-100008742 |
| quality_bin | high | low | high | low | high | low | marginal |
| quality_score | 4.463 | 0.733 | 5.0 | 2.638 | 4.68 | 1.968 | 4.12 |
| OriginalCT.recommended | True | False | True | True | True | True | False |
| OriginalCT | PolyHighResolution | Other | MonoHighResolution | PolyHighResolution | PolyHighResolution | PolyHighResolution | Other |
| CC | 0.996 | 0.953 | 1.0 | 0.975 | 0.996 | 0.963 | 0.989 |
| CR | 98.913 | 97.464 | 100.0 | 98.188 | 100.0 | 99.638 | 100.0 |
| FLD | 6.761 | 3.694 | NaN | 4.352 | 6.359 | 7.161 | NaN |
| HetSO | 0.745 | -0.28 | NaN | 0.421 | 0.36 | 0.161 | NaN |
| Nclus | 3 | 2 | 1 | 3 | 3 | 3 | 1 |
| MMD | 36.317 | NaN | NaN | 29.674 | 47.444 | 50.152 | NaN |
| het_frac | 0.238095 | 0.026022 | 0.0 | 0.092251 | 0.105072 | 0.425455 | 0.0 |
| MinorAlleleFrequency | 0.145 | 0.013 | 0.0 | 0.061 | 0.056 | 0.34 | 0.0 |
| H.W.p-Value | 0.469615 | 1.0 | 1.0 | 0.010957 | 0.590233 | 0.388351 | 1.0 |
| AA.meanX.clean | 2.649 | NaN | NaN | 2.14 | 2.31 | 1.827 | NaN |
| AB.meanX.abs_clean | 0.283 | 0.8 | NaN | 0.6 | 0.095 | 0.362 | NaN |
| BB.meanX.clean | -2.685 | 0.05 | -1.332 | -1.778 | -2.964 | -2.128 | -0.366 |
| HomRO | 2.649 | -0.05 | 1.332 | 1.778 | 2.31 | 1.827 | 0.366 |
| AA.meanY.clean | 9.449 | NaN | NaN | 9.366 | 10.681 | 10.974 | NaN |
| AB.meanY.clean | 10.155 | 10.206 | NaN | 9.599 | 11.027 | 11.212 | NaN |
| BB.meanY.clean | 9.36 | 10.534 | 10.195 | 8.886 | 10.647 | 11.115 | 9.107 |
| meanY | 9.615 | 10.526 | 10.195 | 8.959 | 10.687 | 11.138 | 9.107 |
| Hom.meanY.delta | 0.089 | 0.21 | 0.133 | 0.48 | 0.035 | 0.141 | 0.605 |
| AA.varX.clean | 0.379 | NaN | NaN | 0.03 | 0.03 | 0.244 | NaN |
| AB.varX.clean | 0.204 | 0.112 | NaN | 0.313 | 0.275 | 0.168 | NaN |
| BB.varX.clean | 0.411 | 0.197 | 0.199 | 0.344 | 0.378 | 0.298 | 0.242 |
| AA.varY.clean | 0.253 | NaN | NaN | 0.03 | 0.03 | 0.172 | NaN |
| AB.varY.clean | 0.192 | 0.271 | NaN | 0.257 | 0.151 | 0.218 | NaN |
| BB.varY.clean | 0.25 | 0.216 | 0.128 | 0.26 | 0.243 | 0.175 | 0.199 |
# Inspect the dtype pandas assigned to each column when reading the file.
ps_data.dtypes
probeset_id object quality_bin object quality_score float64 OriginalCT.recommended bool OriginalCT object CC float64 CR float64 FLD float64 HetSO float64 Nclus int64 MMD float64 het_frac float64 MinorAlleleFrequency float64 H.W.p-Value float64 AA.meanX.clean float64 AB.meanX.abs_clean float64 BB.meanX.clean float64 HomRO float64 AA.meanY.clean float64 AB.meanY.clean float64 BB.meanY.clean float64 meanY float64 Hom.meanY.delta float64 AA.varX.clean float64 AB.varX.clean float64 BB.varX.clean float64 AA.varY.clean float64 AB.varY.clean float64 BB.varY.clean float64 dtype: object
# Some metrics are only computed for certain categories of probeset, so
# tabulate how much of each column is missing.
# NOTE: 'frac_missing' holds a percentage (0-100), not a fraction (0-1),
# because the per-column mean of the null mask is multiplied by 100.
missing = pd.DataFrame(
    {
        'count_missing': ps_data.isnull().sum(),
        'frac_missing': 100 * ps_data.isnull().mean(),
    }
)
# Display the columns with the most missing values first.
missing.sort_values('count_missing', ascending=False)
| count_missing | frac_missing | |
|---|---|---|
| MMD | 549639 | 68.011582 |
| AA.meanX.clean | 340804 | 42.170623 |
| AA.varY.clean | 340804 | 42.170623 |
| AA.varX.clean | 340804 | 42.170623 |
| AA.meanY.clean | 340804 | 42.170623 |
| BB.varX.clean | 208499 | 25.799383 |
| BB.meanY.clean | 208499 | 25.799383 |
| BB.meanX.clean | 208499 | 25.799383 |
| BB.varY.clean | 208499 | 25.799383 |
| FLD | 30841 | 3.816223 |
| AB.meanY.clean | 30812 | 3.812635 |
| AB.varY.clean | 30812 | 3.812635 |
| AB.varX.clean | 30812 | 3.812635 |
| AB.meanX.abs_clean | 30812 | 3.812635 |
| HetSO | 30812 | 3.812635 |
| HomRO | 29 | 0.003588 |
| meanY | 0 | 0.000000 |
| quality_score | 0 | 0.000000 |
| OriginalCT.recommended | 0 | 0.000000 |
| OriginalCT | 0 | 0.000000 |
| Hom.meanY.delta | 0 | 0.000000 |
| Nclus | 0 | 0.000000 |
| CC | 0 | 0.000000 |
| CR | 0 | 0.000000 |
| quality_bin | 0 | 0.000000 |
| H.W.p-Value | 0 | 0.000000 |
| MinorAlleleFrequency | 0 | 0.000000 |
| het_frac | 0 | 0.000000 |
| probeset_id | 0 | 0.000000 |
# Keep the column labels of ps_data in `cols` and display them.
cols = ps_data.columns
cols
Index(['probeset_id', 'quality_bin', 'quality_score', 'OriginalCT.recommended',
'OriginalCT', 'CC', 'CR', 'FLD', 'HetSO', 'Nclus', 'MMD', 'het_frac',
'MinorAlleleFrequency', 'H.W.p-Value', 'AA.meanX.clean',
'AB.meanX.abs_clean', 'BB.meanX.clean', 'HomRO', 'AA.meanY.clean',
'AB.meanY.clean', 'BB.meanY.clean', 'meanY', 'Hom.meanY.delta',
'AA.varX.clean', 'AB.varX.clean', 'BB.varX.clean', 'AA.varY.clean',
'AB.varY.clean', 'BB.varY.clean'],
dtype='object')
# Select the columns for the pairwise plot by excluding 'CC' and 'Nclus'
# (per the original note, the intent is to drop columns with only 2 or 3
# values).
# An earlier variant also dropped 'OriginalCT.recommended' and reversed the
# column order:
#cols_plot = cols.drop(['OriginalCT.recommended','CC','Nclus'])[::-1] # reverse columns
cols_plot = cols.drop(labels=['CC', 'Nclus'])
# Display the remaining column labels.
cols_plot
Index(['probeset_id', 'quality_bin', 'quality_score', 'OriginalCT.recommended',
'OriginalCT', 'CR', 'FLD', 'HetSO', 'MMD', 'het_frac',
'MinorAlleleFrequency', 'H.W.p-Value', 'AA.meanX.clean',
'AB.meanX.abs_clean', 'BB.meanX.clean', 'HomRO', 'AA.meanY.clean',
'AB.meanY.clean', 'BB.meanY.clean', 'meanY', 'Hom.meanY.delta',
'AA.varX.clean', 'AB.varX.clean', 'BB.varX.clean', 'AA.varY.clean',
'AB.varY.clean', 'BB.varY.clean'],
dtype='object')
# Pairwise plots of the selected columns, with KDE plots on the diagonal and
# points colored by 'OriginalCT.recommended'.
#
# dropna is left at its default (False): dropping NAs would remove every row
# in which at least one plotted variable is NA, and this data has a lot of
# missing values.
# corner is left at its default (False): plotting only the lower triangle
# might be faster, but the grid is harder to scan through.
#
# Only a random sample of 10,000 rows is plotted. random_state pins the
# sample so that re-running the cell reproduces the same figure.
#
# Earlier variants, kept for reference:
#_ = sns.pairplot(ps_data[cols_plot].sample(frac=0.02),corner=True,diag_kind='kde',dropna=False)
#_ = sns.pairplot(ps_data[cols_plot].sample(frac=0.02),diag_kind='kde')
_ = sns.pairplot(
    ps_data[cols_plot].sample(n=10000, random_state=0),
    diag_kind='kde',
    hue='OriginalCT.recommended',
)